{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5f6d518-dfe8-4651-b315-44bb55b885be",
   "metadata": {},
   "outputs": [],
   "source": [
    "import intake\n",
    "import pandas as pd\n",
    "import hvplot.pandas"
   ]
  },
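  {
   "cell_type": "markdown",
   "id": "718dfddd-14eb-4a4e-ad9c-19f0c9f10f3c",
   "metadata": {},
   "source": [
    "#### Guess reader from existing code\n",
    "\n",
    "The `_i` passed to `intake.reader_from_call` below is IPython's record of the previous cell's input, so that cell must be run immediately after the `pd.read_csv` cell."
   ]
  },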
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1eb57314-f056-465e-ba75-22d5d0ca232b",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"s3://mymdtemp/intake_1.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31dde28e-718d-462e-b9bf-30601db948ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(url, storage_options={\"anon\": True}, usecols=[1, 2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1daac4ed-e5af-4a63-8a12-d97022e90d99",
   "metadata": {},
   "outputs": [],
   "source": [
    "reader = intake.reader_from_call(_i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bda407ee-f26c-4788-b367-1fc3f370ccbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "reader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3acf782f-b887-4da7-a8b3-78463c52c8eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "reader.kwargs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c718b8bf-0489-4e29-b31c-b354f6373224",
   "metadata": {},
   "source": [
    "#### Or guess from the URL alone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9900e6f4-b896-40ad-bbaf-cb2563ae8252",
   "metadata": {},
   "outputs": [],
   "source": [
    "# uses URL alone, but can also match on magic bytes\n",
    "intake.datatypes.recommend(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "771538c7-1467-4b1c-82ef-a9ace372465c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = intake.readers.datatypes.CSV(url, storage_options={\"anon\": True})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d9bd954-7c31-4638-80bf-ffc4cc7fb452",
   "metadata": {},
   "source": [
    "#### \"What can read this?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4dac3e18-f60a-4b91-946b-36d766a73bfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.possible_outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "654e74d0-aa6a-4211-a04d-4fc6886c58df",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.possible_readers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ddebb3c-745d-4294-9dee-583214075df7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# same reader as original\n",
    "# reader = data.to_reader(\"pandas:DataFrame\")\n",
    "reader = intake.readers.readers.PandasCSV(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e7ae6c4-6db4-4c42-b076-ea4225ba08df",
   "metadata": {},
   "source": [
    "```python\n",
    "class PandasCSV(Pandas):\n",
    "    implements = {datatypes.CSV}\n",
    "    func = \"pandas:read_csv\"\n",
    "    url_arg = \"filepath_or_buffer\"\n",
    "\n",
    "    def discover(self, **kw):\n",
    "        kw[\"nrows\"] = 10\n",
    "        kw.pop(\"skipfooter\", None)\n",
    "        kw.pop(\"chunksize\", None)\n",
    "        return self.read(**kw)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc10228a-ae60-4551-a2a0-81e399d51130",
   "metadata": {},
   "source": [
    "#### Reader API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4d9651d-ef91-4611-b5ae-ee3d762e7196",
   "metadata": {},
   "outputs": [],
   "source": [
    "reader.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da8e7b50-f7f5-4520-ae44-3c2e5d6a91ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(reader.doc())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98112835-41a3-48b5-bc92-7e923c263c38",
   "metadata": {},
   "outputs": [],
   "source": [
    "# known transforms and what they make\n",
    "reader.transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9d0ff3e-1723-4b8a-b549-7ad703c56cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# but have access to full DataFrame API\n",
    "dir(reader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d1fbfc6-70b9-4d84-80f3-fca293d8860f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# or \"pd\" namespace (useful for some packages)\n",
    "reader.pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "08249340-d143-4463-b534-6b6ef83f6f79",
   "metadata": {},
   "source": [
    "#### So lets make a catalog and a pipeline using pandas syntax"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "843a8a67-5ac8-4128-b8fc-5a897d344578",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat = intake.entry.Catalog()\n",
    "cat[\"tute\"] = reader\n",
    "cat[\"capitals\"] = reader.a.str.capitalize()\n",
    "cat[\"inverted\"] = reader.sort_values(\"b\", ascending=False)\n",
    "cat[\"multi\"] = cat.tute.assign(c=cat.capitals)  # <- uses multiple readers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ccf12af-dd3b-429f-be8b-a7fcc749c7a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "reader.a.str.capitalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1eb4728-eda8-467d-a4f0-be280309cc0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# what gets stored in the catalog entry?\n",
    "cat.entries[\"multi\"].kwargs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9393b669-9489-41fb-ad2f-c98970410770",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8a123f6-17cd-404b-acd4-aef832a6e544",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat.tute.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18d0bb78-d902-4689-9586-0ebc1686af32",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat.data  # just one data item"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c2a6379-1bb6-4258-996c-823ffc928eb9",
   "metadata": {},
   "source": [
    "#### To and from catalog file, which you can put anywhere"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca865c7e-a271-4ec9-a44d-efbd68138da0",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat.to_yaml_file(\"intake_1.yaml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b9df0f9-a86a-4b73-9a1f-e355ab5be230",
   "metadata": {},
   "outputs": [],
   "source": [
    "# a \"shared\" one I prepared for everyone\n",
    "cat = intake.from_yaml_file(\"s3://mymdtemp/intake_1.yaml\", anon=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "529a98be-c500-42c4-8b8e-fc8d535d60f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# yes, you have <tab> completion\n",
    "cat.tute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24e83c33-3ba7-4012-ac08-ac3b40521c42",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat.inverted.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e3b35a6f-8583-421a-8321-65a62bcf0777",
   "metadata": {},
   "source": [
    "#### And now you can go about your work; but some convenience functions might still be useful."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a27fec42-c46a-442a-9574-83b4d312223f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add arguments to make a reader you can persist\n",
    "cat.inverted.ToHvPlot(explorer=True).read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7b89802-4b83-4587-9103-416ae6423100",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat.inverted.ToMatplotlib.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68c24625-9aab-405a-b2bd-8b6a7a43bb8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# you can even have Intake guess the whole pipeline\n",
    "intake.auto_pipeline(data, \"PNG\", avoid=\"Geo\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50504f5b-622c-43f7-ae68-5e71730bc2ab",
   "metadata": {},
   "source": [
    "#### But pandas was not the only engine that can work on this data. We can play with the API or make more readers to persist in the catalog."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c684789c-0038-4f12-8668-640f3f5553c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31a9d94c-5025-4de2-99f8-eb966afa26e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_reader(\"dask\").read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a4a9622-0b1f-4bab-887e-0e64e4a5098e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_reader(\"ray\").read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d481550f-4bda-4864-9b29-c163301b1f36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dask-on-ray!\n",
    "data.to_reader(\"dask\").DaskToRay.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d00b8b13-6478-4691-b5d8-b76cde9cecd5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
